home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
C/C++ Users Group Library 1996 July
/
C-C++ Users Group Library July 1996.iso
/
vol_100
/
167_01
/
mtp.c
< prev
next >
Wrap
Text File
|
1985-11-13
|
20KB
|
792 lines
/*
HEADER: CUG
TITLE: MTP.C - Macro Text Processor;
VERSION: 0.05;
DATE: 09/13/86;
DESCRIPTION: "MTP is a Macro Text Processor that performs
macro expansions of specified text patterns in
a file. Based on the deterministic finite state
automaton of UNIX's fgrep utility, it has the
distinction of not requiring a symbol table for
the list of macro expansions. The text patterns
are identified without backtracking, and need not
consist of one alphanumeric 'word' as with most
other macro processors.";
KEYWORDS: macro,text,fgrep,filter;
SYSTEM: ANY;
FILENAME: MTP.C;
WARNINGS: "As you can see from the version number, MTP is
very much in the developmental stage. This is not
to say it is bug-ridden. What is presented here
should be free of bugs. It does however have very
few options at present. These will be added in
future versions, based on user (yes - YOU!)
feedback. It is being released as an example of
what you can do with the Aho-Corasick algorithm
other than FGREP (see FGREP.DOC)";
CRC: xxxx
SEE-ALSO: FGREP.DOC,FGREP.C;
AUTHORS: Ian Ashdown - byHeart Software;
COMPILERS: ANY;
REFERENCES: ;
ENDREF
*/
/*-------------------------------------------------------------*/
/* MTP.C - Macro Text Processor
*
* Version 0.05 September 13th, 1986
*
* Copyright 1986: Ian Ashdown
* byHeart Software
* 620 Ballantree Road
* West Vancouver, B.C.
* Canada V7S 1W3
* (604) 922-6148
*
* This program may be copied for personal, non-commercial use
* only, provided that the above copyright notice is included in
* all copies of the source code. Copying for any other use
* without previously obtaining the written permission of the
* author is prohibited.
*
* Notes:
*
* The algorithm used in this program constructs a deterministic
* finite state automaton (FSA) for pattern matching from the sub-
* strings, then uses the FSA to process the text string in one
* pass. The time taken to construct the FSA is proportional to
* the sum of the lengths of the the substrings. The number of
* state transitions made by the FSA in processing the text
* string is independent of the number of substrings.
*
* Algorithm Source:
*
* "Efficient String Matching: An Aid to Bibliographic Search"
* Alfred V. Aho & Margaret J. Corasick
* Communications of the ACM
* pp. 333 - 340, Vol. 18 No. 6 (June '75)
*
* USAGE: mtp [-n] string_file <files>
*
* where:
*
* -n Substitutions are not nested. That is, once a
* translation string has been substituted for a
* source string, the substitution itself is not
* scanned for another string to substitute.
*
* string_file
*
* is a text file of newline-separated
* source-translation strings. Each line of the file
* comprises one string, and consists of a source
* string, followed by one or more '\t' (tab)
* characters, and a translation string. Source
* strings may not contain '\t' or '\0' (NULL)
* characters. Translation strings may not contain
* '\0' characters. Empty lines consisting of only a
* newline ('\n') are permitted. Example:
*
* Replace me\twith me!
*
* file is any text file consisting of newline-separated
* strings.
*
* DIAGNOSTICS:
*
* Exit status is 0 unless error was encountered, in which case
* an exit status of 2 is returned.
*
* BUGS: Lines are limited to 256 characters.
*/
/*** Include Files ***/
#include <stdio.h>
#include <ctype.h>
/*** Definitions ***/
#define TRUE 1
#define FALSE 0
#define MAX_LINE 257 /* Maximum number of characters */
/* per line plus NULL delimiter */
/* The following definition applies to the Lattice C compiler
* (Version 2.15) for MS-DOS. The Lattice library does not have
* the Unix function "index()". However, it does have an
* equivalent function called "stpchr()".
*/
/* #define index stpchr */ /* Remove comment delimiters for */
/* Lattice C version 2.15 */
#define CMD_ERR 0 /* Error codes */
#define OPT_ERR 1
#define INP_ERR 2
#define STR_ERR 3
#define MEM_ERR 4
/*** Typedefs ***/
typedef int BOOL; /* Boolean flag */
/* Some C compilers, including Lattice C, do not support the
* "void" data type. Remove the following comment delimiters for
* such compilers.
*/
/* typedef int void; */
/*** Data Structures ***/
/* Queue element */
typedef struct queue
{
struct state_el *st_ptr;
struct queue *next_el;
} QUEUE;
/* Transition element */
typedef struct transition
{
char lchar; /* Transition character */
struct state_el *nextst_ptr; /* Transition state pointer */
struct transition *next_el;
} TRANSITION;
/* FSA state element */
typedef struct state_el
{
TRANSITION *go_ls; /* Pointer to head of "go" list */
TRANSITION *mv_ls; /* Pointer to head of "move" list */
struct state_el *fail_state; /* "failure" transition state */
char *out_str; /* Terminal state message (if any) */
int src_length; /* Source substring length */
int trn_length; /* Translation substring length */
} FSA;
/*** Global Variables and Structures ***/
/* Dummy "failure" state */
FSA FAIL_STATE;
/* Define a separate data structure for State 0 of the FSA to
* speed processing of the input while the FSA is in that state.
* Since the Aho-Corasick algorithm only defines "go" transitions
* for this state (one for each valid input character) and no
* "failure" transitions or output messages, only an array of
* "go" transition state numbers is needed. The array is accessed
* directly, using the input character as the index.
*/
FSA *MZ[128]; /* State 0 of FSA */
/* Command-line option flags */
BOOL nflag = FALSE; /* Substitution nesting disabled if TRUE */
/*** Main Body of Program ***/
int main(argc,argv)
int argc;
char **argv;
{
char *temp;
void proc_file(),
bd_go(),
bd_move(),
error();
/* Check for minimum number of command line arguments */
if(argc < 2)
error(CMD_ERR,NULL);
/* Parse the command line for user-selected options */
while(--argc && (*++argv)[0] == '-')
for(temp = argv[0]+1; *temp != '\0'; temp++)
switch(toupper(*temp))
{
case 'N':
nflag = TRUE;
break;
default:
error(OPT_ERR,NULL);
}
/* Check for string file argument */
if(!argc)
error(CMD_ERR,NULL);
/* Build the "go" transitions. */
bd_go(*argv++);
argc--;
/* Build the "failure" and "move" transitions */
bd_move();
/* Process each of the input files if not "stdin". */
if(!argc)
proc_file(NULL);
else
while(argc--)
proc_file(*argv++);
/* Return 0 to the parent process to indicate no errors. */
exit(0);
}
/*** Functions and Procedures ***/
/* PROC_FILE() - Run the FSA on the input file "in_file" */
void proc_file(in_file)
char *in_file;
{
char buffer[2*MAX_LINE], /* Input string buffer */
*nl,
*index(),
*fgets();
FILE *in_fd,
*fopen();
void run_fsa(),
error();
if(in_file != NULL) /* A file was specified as the input */
{
if(!(in_fd = fopen(in_file,"r")))
error(INP_ERR,in_file);
}
else
in_fd = stdin;
/* Read in a line at a time for processing */
while(fgets(buffer,MAX_LINE,in_fd))
{
if(nl = index(buffer,'\n')) /* Remove newline */
*nl = '\0';
run_fsa(buffer); /* Run the FSA on string "buffer" */
puts(buffer); /* Output the translated string */
}
if(in_file != NULL)
fclose(in_fd);
}
/* RUN_FSA() - Run the finite state automaton with string
* "target" as input.
*/
void run_fsa(target)
register char *target;
{
register FSA *st_ptr;
char tail[MAX_LINE];
FSA *go(),
*move();
st_ptr = NULL; /* Initialize FSA */
/* Process the next input character in the target string */
while(*target)
{
st_ptr = move(st_ptr,*target);
/* Substitute translation string for sou